{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2-推荐系统"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set(context=\"notebook\", style=\"white\", palette=sns.color_palette(\"RdBu\"))\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy.io as sio"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# load data and setting up"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "% Notes: X - num_movies (1682)  x num_features (10) matrix of movie features  \n",
    "%        Theta - num_users (943)  x num_features (10) matrix of user features  \n",
    "%        Y - num_movies x num_users matrix of user ratings of movies  \n",
    "%        R - num_movies x num_users matrix, where R(i, j) = 1 if the  \n",
    "%            i-th movie was rated by the j-th user  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1682, 943), (1682, 943))"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies_mat = sio.loadmat('./data/ex8_movies.mat')\n",
    "Y, R = movies_mat.get('Y'), movies_mat.get('R')\n",
    "\n",
    "Y.shape, R.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "m, u = Y.shape\n",
    "# m: how many movies\n",
    "# u: how many users\n",
    "\n",
    "n = 10  # how many features for a movie"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((943, 10), (1682, 10))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param_mat = sio.loadmat('./data/ex8_movieParams.mat')\n",
    "theta, X = param_mat.get('Theta'), param_mat.get('X')\n",
    "\n",
    "theta.shape, X.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# cost\n",
    "<img style=\"float: left;\" src=\"../img/rcmd_cost.png\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def serialize(X, theta):\n",
    "    \"\"\"serialize 2 matrix\n",
    "    \"\"\"\n",
    "    # X (movie, feature), (1682, 10): movie features\n",
    "    # theta (user, feature), (943, 10): user preference\n",
    "    return np.concatenate((X.ravel(), theta.ravel()))\n",
    "\n",
    "\n",
    "def deserialize(param, n_movie, n_user, n_features):\n",
    "    \"\"\"into ndarray of X(1682, 10), theta(943, 10)\"\"\"\n",
    "    return param[:n_movie * n_features].reshape(n_movie, n_features), \\\n",
    "           param[n_movie * n_features:].reshape(n_user, n_features)\n",
    "\n",
    "\n",
    "# recommendation fn\n",
    "def cost(param, Y, R, n_features):\n",
    "    \"\"\"compute cost for every r(i, j)=1\n",
    "    Args:\n",
    "        param: serialized X, theta\n",
    "        Y (movie, user), (1682, 943): (movie, user) rating\n",
    "        R (movie, user), (1682, 943): (movie, user) has rating\n",
    "    \"\"\"\n",
    "    # theta (user, feature), (943, 10): user preference\n",
    "    # X (movie, feature), (1682, 10): movie features\n",
    "    n_movie, n_user = Y.shape\n",
    "    X, theta = deserialize(param, n_movie, n_user, n_features)\n",
    "\n",
    "    inner = np.multiply(X @ theta.T - Y, R)\n",
    "\n",
    "    return np.power(inner, 2).sum() / 2\n",
    "\n",
    "\n",
    "def gradient(param, Y, R, n_features):\n",
    "    # theta (user, feature), (943, 10): user preference\n",
    "    # X (movie, feature), (1682, 10): movie features\n",
    "    n_movies, n_user = Y.shape\n",
    "    X, theta = deserialize(param, n_movies, n_user, n_features)\n",
    "\n",
    "    inner = np.multiply(X @ theta.T - Y, R)  # (1682, 943)\n",
    "\n",
    "    # X_grad (1682, 10)\n",
    "    X_grad = inner @ theta\n",
    "\n",
    "    # theta_grad (943, 10)\n",
    "    theta_grad = inner.T @ X\n",
    "\n",
    "    # roll them together and return\n",
    "    return serialize(X_grad, theta_grad)\n",
    "\n",
    "\n",
    "def regularized_cost(param, Y, R, n_features, l=1):\n",
    "    reg_term = np.power(param, 2).sum() * (l / 2)\n",
    "\n",
    "    return cost(param, Y, R, n_features) + reg_term\n",
    "\n",
    "\n",
    "def regularized_gradient(param, Y, R, n_features, l=1):\n",
    "    grad = gradient(param, Y, R, n_features)\n",
    "    reg_term = l * param\n",
    "\n",
    "    return grad + reg_term\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "22.224603725685675"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use subset of data to calculate the cost as in pdf...\n",
    "users = 4\n",
    "movies = 5\n",
    "features = 3\n",
    "\n",
    "X_sub = X[:movies, :features]\n",
    "theta_sub = theta[:users, :features]\n",
    "Y_sub = Y[:movies, :users]\n",
    "R_sub = R[:movies, :users]\n",
    "\n",
    "param_sub = serialize(X_sub, theta_sub)\n",
    "cost(param_sub, Y_sub, R_sub, features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "27918.64012454421"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param = serialize(X, theta)  # total real params\n",
    "\n",
    "cost(serialize(X, theta), Y, R, 10)  # this is real total cost"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# gradient\n",
    "<img style=\"float: left;\" src=\"../img/rcmd_gradient.png\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_movie, n_user = Y.shape\n",
    "\n",
    "X_grad, theta_grad = deserialize(gradient(param, Y, R, 10),\n",
    "                                      n_movie, n_user, 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img style=\"float: left;\" src=\"../img/rcmd_vectorized_grad.png\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "assert X_grad.shape == X.shape\n",
    "assert theta_grad.shape == theta.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# regularized cost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31.344056244274221"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# in the ex8_confi.m, lambda = 1.5, and it's using sub data set\n",
    "regularized_cost(param_sub, Y_sub, R_sub, features, l=1.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32520.682450229557"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regularized_cost(param, Y, R, 10, l=1)  # total regularized cost"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# regularized gradient"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img style=\"float: left;\" src=\"../img/rcmd_reg_grad.png\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_movie, n_user = Y.shape\n",
    "\n",
    "X_grad, theta_grad = deserialize(regularized_gradient(param, Y, R, 10),\n",
    "                                                                n_movie, n_user, 10)\n",
    "\n",
    "assert X_grad.shape == X.shape\n",
    "assert theta_grad.shape == theta.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# parse `movie_id.txt`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "movie_list = []\n",
    "\n",
    "with open('./data/movie_ids.txt', encoding='latin-1') as f:\n",
    "    for line in f:\n",
    "        tokens = line.strip().split(' ')\n",
    "        movie_list.append(' '.join(tokens[1:]))\n",
    "\n",
    "movie_list = np.array(movie_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# reproduce my ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ratings = np.zeros(1682)\n",
    "\n",
    "ratings[0] = 4\n",
    "ratings[6] = 3\n",
    "ratings[11] = 5\n",
    "ratings[53] = 4\n",
    "ratings[63] = 5\n",
    "ratings[65] = 3\n",
    "ratings[68] = 5\n",
    "ratings[97] = 2\n",
    "ratings[182] = 4\n",
    "ratings[225] = 5\n",
    "ratings[354] = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# prepare data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1682, 944)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y, R = movies_mat.get('Y'), movies_mat.get('R')\n",
    "\n",
    "\n",
    "Y = np.insert(Y, 0, ratings, axis=1)  # now I become user 0\n",
    "Y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1682, 944)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "R = np.insert(R, 0, ratings != 0, axis=1)\n",
    "R.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "n_features = 50\n",
    "n_movie, n_user = Y.shape\n",
    "l = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1682, 50), (944, 50))"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = np.random.standard_normal((n_movie, n_features))\n",
    "theta = np.random.standard_normal((n_user, n_features))\n",
    "\n",
    "X.shape, theta.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "param = serialize(X, theta)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "normalized ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4.6862111343939375e-17"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y_norm = Y - Y.mean()\n",
    "Y_norm.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import scipy.optimize as opt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "res = opt.minimize(fun=regularized_cost,\n",
    "                   x0=param,\n",
    "                   args=(Y_norm, R, n_features, l),\n",
    "                   method='TNC',\n",
    "                   jac=regularized_gradient)\n",
    "#这里很慢"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     fun: 64721.497815072536\n",
       "     jac: array([ -1.35092039e-07,  -5.05980486e-08,  -7.61559766e-08, ...,\n",
       "        -2.97811034e-07,  -8.12815282e-08,  -1.47592943e-07])\n",
       " message: 'Converged (|f_n-f_(n-1)| ~= 0)'\n",
       "    nfev: 2300\n",
       "     nit: 74\n",
       "  status: 1\n",
       " success: True\n",
       "       x: array([ 0.152063  ,  0.05385871,  0.10072402, ..., -0.47800815,\n",
       "        0.19233806,  0.18563834])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1682, 50), (944, 50))"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_trained, theta_trained = deserialize(res.x, n_movie, n_user, n_features)\n",
    "X_trained.shape, theta_trained.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "prediction = X_trained @ theta_trained.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "my_preds = prediction[:, 0] + Y.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1682,)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx = np.argsort(my_preds)[::-1]  # Descending order\n",
    "idx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 4.12534012,  4.04415049,  3.99324638,  3.91902741,  3.81691636,\n",
       "        3.81556276,  3.76602832,  3.76323164,  3.75906029,  3.75076642])"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# top ten idx\n",
    "my_preds[idx][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Titanic (1997)\n",
      "Star Wars (1977)\n",
      "Shawshank Redemption, The (1994)\n",
      "Forrest Gump (1994)\n",
      "Raiders of the Lost Ark (1981)\n",
      "Braveheart (1995)\n",
      "Return of the Jedi (1983)\n",
      "Usual Suspects, The (1995)\n",
      "Godfather, The (1972)\n",
      "Schindler's List (1993)\n"
     ]
    }
   ],
   "source": [
    "for m in movie_list[idx][:10]:\n",
    "    print(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
